Last updated: 2015-10-24
Code version: 4abacbabb75c62d74de567b85e4503d4985f2061
Previously, we compared normalized coefficient of variations across individuals. Here, we will also compare the mean gene expression across individuals, using single cell sequencing data and bulk RNA-seq data.
It would be interesting to learn the possible overlap or non-overlap between the genes that we observed significant individual differences in coefficient of variations versus those different in mean gene expression levels across cells.
From here on we will work with the filtered data without NA19098.r2.
library("data.table")
library("dplyr")
library("limma")
library("edgeR")
library("ggplot2")
library("grid")
theme_set(theme_bw(base_size = 12))
source("functions.R")
Input annotation of only QC-filtered single cells. Remove NA19098.r2
anno_qc <- read.table("../data/annotation-filter.txt", header = TRUE,
stringsAsFactors = FALSE)
is_include <- anno_qc$batch != "NA19098.r2"
anno_qc_filter <- anno_qc[which(is_include), ]
Import endogeneous gene molecule counts that are QC-filtered, CPM-normalized, ERCC-normalized, and also processed to remove unwanted variation from batch effet. ERCC genes are removed from this file.
molecules_ENSG <- read.table("../data/molecules-final.txt", header = TRUE, stringsAsFactors = FALSE)
molecules_ENSG <- molecules_ENSG[ , is_include]
Input moleclule counts before log2 CPM transformation. This file is used to compute percent zero-count cells per sample.
molecules_sparse <- read.table("../data/molecules-filter.txt", header = TRUE, stringsAsFactors = FALSE)
molecules_sparse <- molecules_sparse[grep("ENSG", rownames(molecules_sparse)), ]
stopifnot( all.equal(rownames(molecules_ENSG), rownames(molecules_sparse)) )
Compute coefficient of variation.
# Compute CV and mean of normalized molecule counts (take 2^(log2-normalized count))
molecules_cv_batch_ENSG <-
lapply(1:length(unique(anno_qc_filter$batch)), function(per_batch) {
molecules_per_batch <- 2^molecules_ENSG[ , unique(anno_qc_filter$batch) == unique(anno_qc_filter$batch)[per_batch] ]
mean_per_gene <- apply(molecules_per_batch, 1, mean, na.rm = TRUE)
sd_per_gene <- apply(molecules_per_batch, 1, sd, na.rm = TRUE)
cv_per_gene <- data.frame(mean = mean_per_gene,
sd = sd_per_gene,
cv = sd_per_gene/mean_per_gene)
rownames(cv_per_gene) <- rownames(molecules_ENSG)
# cv_per_gene <- cv_per_gene[rowSums(is.na(cv_per_gene)) == 0, ]
cv_per_gene$batch <- unique(anno_qc_filter$batch)[per_batch]
# Add sparsity percent
molecules_sparse_per_batch <- molecules_sparse[ , unique(anno_qc_filter$batch) == unique(anno_qc_filter$batch)[per_batch]]
cv_per_gene$sparse <- rowMeans(as.matrix(molecules_sparse_per_batch) == 0)
return(cv_per_gene)
})
names(molecules_cv_batch_ENSG) <- unique(anno_qc_filter$batch)
Merge summary data.frames.
df_ENSG <- do.call(rbind, molecules_cv_batch_ENSG)
Compute rolling medians across all samples.
library(zoo)
# Compute a data-wide coefficient of variation on CPM normalized counts.
data_cv_ENSG <- apply(2^molecules_ENSG, 1, sd)/apply(2^molecules_ENSG, 1, mean)
# Order of genes by mean expression levels
order_gene <- order(apply(2^molecules_ENSG, 1, mean))
# Rolling medians of log10 squared CV by mean expression levels
roll_medians <- rollapply(log10(data_cv_ENSG^2)[order_gene], width = 50, by = 25,
FUN = median, fill = list("extend", "extend", "NA") )
ii_na <- which( is.na(roll_medians) )
roll_medians[ii_na] <- median( log10(data_cv_ENSG^2)[order_gene][ii_na] )
names(roll_medians) <- rownames(molecules_ENSG)[order_gene]
# re-order rolling medians
reorder_gene <- match(rownames(molecules_ENSG), names(roll_medians) )
roll_medians <- roll_medians[ reorder_gene ]
stopifnot( all.equal(names(roll_medians), rownames(molecules_ENSG) ) )
Compute adjusted coefficient of variation.
# adjusted coefficient of variation on log10 scale
log10cv2_adj_ENSG <-
lapply(1:length(molecules_cv_batch_ENSG), function(per_batch) {
foo <- log10(molecules_cv_batch_ENSG[[per_batch]]$cv^2) - roll_medians
return(foo)
})
df_ENSG$log10cv2_adj_ENSG <- do.call(c, log10cv2_adj_ENSG)
library(limma)
df_limma <- matrix(df_ENSG$log10cv2_adj_ENSG,
nrow = nrow(molecules_ENSG), ncol = 8, byrow = FALSE)
design <- data.frame(individual = factor(rep(unique(anno_qc_filter$individual), each = 3) ),
rep = factor(rep(c(1:3), times = 3)) )
design <- design[ with(design, !(individual == "NA19098" & rep == "2")), ]
colnames(df_limma) <- with(design, paste0(individual, rep))
fit_limma <- lmFit(df_limma, design = model.matrix( ~ individual, data = design))
fit_limma <- eBayes(fit_limma)
False discovery control adjustment.
F.p.adj <- p.adjust(fit_limma$F.p.value, method = "fdr")
Cutoffs
df_cuts <- data.frame(cuts = c(.001, .01, .05, .1, .15, .2))
df_cuts$sig_count <- sapply(1:6, function(per_cut) {
sum(F.p.adj < df_cuts$cuts[per_cut] )
})
df_cuts
cuts sig_count
1 0.001 18
2 0.010 42
3 0.050 103
4 0.100 177
5 0.150 328
6 0.200 464
False discovery control adjutment.
F.p.adj <- p.adjust(fit_limma$F.p.value, method = "fdr")
We plotted out per gene average CVs across samples versus average expression level for the endogeneous genes, and identified the genes classifed as significantly different between individuals in coefficients of variation.
df_compare <-
data.frame(mean = rowMeans( as.matrix(
do.call(cbind, lapply(molecules_cv_batch_ENSG, "[[", 1) ) ) ),
cv2 = rowMeans( as.matrix(
do.call(cbind, lapply(molecules_cv_batch_ENSG, "[[", 3) ) )^2 ),
adj_cv2 = rowMeans( 10^as.matrix(
do.call(cbind, log10cv2_adj_ENSG) ) ) )
library(broman)
crayon <- brocolors("crayons")
par(mar=c(5,5,3,1))
with(df_compare, plot(x = log10(mean), y = cv2, pch = 1, cex = 1, col = "grey50",
lwd = .5,
ylab = "average squared coeffcient of variations \n across samples",
xlab = "log10 average of mean CPM across samples") )
with(df_compare[F.p.adj < .2, ], points(x = log10(mean), y = cv2, pch = 16, cex = .6,
col = crayon["Tumbleweed"]) )
with(df_compare[F.p.adj < .1, ], points(x = log10(mean), y = cv2, pch = 16, cex = .6,
col = crayon["Orange"]) )
with(df_compare[F.p.adj < .01, ], points(x = log10(mean), y = cv2, pch = 16, cex = .6,
col = crayon["Scarlet"]) )
title(main = "Avg. Squred CV vs. log10(Avg. mean count)")
legend("topright", pch = c(1, 16, 16, 16),
legend = c("All genes", "Adj p-value < .2",
"Adj p-value < .1", "Adj p-value < .01"),
col = c("grey50", crayon[c("Tumbleweed", "Orange", "Scarlet")]),
bty = "n")
par(mar=c(5,5,3,1))
with(df_compare, plot(x = log10(mean), y = adj_cv2, pch = 1, cex = 1, col = "grey50",
lwd = .5,
ylab = "average squared coeffcient of variations \n across samples",
xlab = "log10 average of mean CPM across samples") )
with(df_compare[F.p.adj < .2, ], points(x = log10(mean), y = adj_cv2, pch = 16, cex = .6,
col = crayon["Tumbleweed"]) )
with(df_compare[F.p.adj < .1, ], points(x = log10(mean), y = adj_cv2, pch = 16, cex = .6,
col = crayon["Orange"]) )
with(df_compare[F.p.adj < .01, ], points(x = log10(mean), y = adj_cv2, pch = 16, cex = .6,
col = crayon["Scarlet"]) )
title(main = "Avg. Adjusted Squred CV vs. log10(Avg. mean count)")
legend("topright", pch = c(1, 16, 16, 16),
legend = c("All genes", "Adj p-value < .2",
"Adj p-value < .1", "Adj p-value < .01"),
col = c("grey50", crayon[c("Tumbleweed", "Orange", "Scarlet")]),
bty = "n")
Our naive analysis seems to pick up genes with different density between individuals.
library(gridExtra)
order_low_cv <- order(df_compare[F.p.adj < .01, ]$cv)
low_plots <- lapply(1:6, function(i) {
ind <- order_low_cv[i]
ggplot(data.frame(values = unlist(molecules_ENSG[ind, ]),
individual = factor(anno_qc_filter$individual),
replicate = factor(anno_qc_filter$replicate),
batch = factor(anno_qc_filter$batch), check.rows = F),
aes(x = values)) +
geom_density(aes(group = batch, col = individual)) +
ggtitle(paste(rownames(molecules_ENSG)[ind],
";CV = ", round(df_compare[F.p.adj < .01, ]$cv[ind], 2) ) )
})
grid.arrange(grobs = low_plots, ncol = 2)
order_high_cv <- order(df_compare[F.p.adj < .01, ]$cv, decreasing = TRUE)
high_plots <- lapply(1:6, function(i) {
ind <- order_high_cv[i]
ggplot(data.frame(values = unlist(molecules_ENSG[ind, ]),
individual = factor(anno_qc_filter$individual),
replicate = factor(anno_qc_filter$replicate),
batch = factor(anno_qc_filter$batch), check.rows = F),
aes(x = values)) +
geom_density(aes(group = batch, col = individual)) +
ggtitle(paste(rownames(molecules_ENSG)[ind],
";CV = ", round(df_compare[F.p.adj < .01, ]$cv[ind], 2) ) )
})
grid.arrange(grobs = high_plots, ncol = 2)
sig_cv <- order(F.p.adj)[F.p.adj < .05]
library("biomaRt")
ensembl <- useMart(host = "grch37.ensembl.org",
biomart = "ENSEMBL_MART_ENSEMBL",
dataset = "hsapiens_gene_ensembl")
differential_CV_genes_info <- getBM(attributes = c("ensembl_gene_id", "chromosome_name",
"external_gene_name", "transcript_count",
"description"),
filters = "ensembl_gene_id",
values = rownames(molecules_ENSG[sig_cv, ]),
mart = ensembl)
differential_CV_genes_info
ensembl_gene_id chromosome_name external_gene_name transcript_count
1 ENSG00000013288 4 MAN2B2 5
2 ENSG00000013375 6 PGM3 14
3 ENSG00000036549 1 ZZZ3 11
4 ENSG00000048471 16 SNX29 10
5 ENSG00000068400 X GRIPAP1 9
6 ENSG00000070669 7 ASNS 17
7 ENSG00000073910 13 FRY 7
8 ENSG00000074590 12 NUAK1 4
9 ENSG00000075240 22 GRAMD4 7
10 ENSG00000079819 6 EPB41L2 31
11 ENSG00000089195 20 TRMT6 5
12 ENSG00000092820 6 EZR 4
13 ENSG00000100564 14 PIGH 12
14 ENSG00000100968 14 NFATC4 33
15 ENSG00000101945 X SUV39H1 5
16 ENSG00000102931 16 ARL2BP 4
17 ENSG00000103042 16 SLC38A7 16
18 ENSG00000103363 16 TCEB2 5
19 ENSG00000104872 19 PIH1D1 24
20 ENSG00000105229 19 PIAS4 6
21 ENSG00000106366 7 SERPINE1 2
22 ENSG00000106554 7 CHCHD3 9
23 ENSG00000108091 10 CCDC6 3
24 ENSG00000108479 17 GALK1 10
25 ENSG00000108799 17 EZH1 25
26 ENSG00000108840 17 HDAC5 13
27 ENSG00000110756 11 HPS5 11
28 ENSG00000112245 6 PTP4A1 5
29 ENSG00000112339 6 HBS1L 18
30 ENSG00000115808 2 STRN 3
31 ENSG00000115816 2 CEBPZ 3
32 ENSG00000118217 1 ATF6 2
33 ENSG00000124067 16 SLC12A4 20
34 ENSG00000125841 20 NRSN2 9
35 ENSG00000127952 7 STYXL1 11
36 ENSG00000130208 19 APOC1 10
37 ENSG00000131653 16 TRAF7 7
38 ENSG00000133424 22 LARGE 19
39 ENSG00000135063 9 FAM189A2 6
40 ENSG00000135250 7 SRPK2 15
41 ENSG00000135315 6 KIAA1009 6
42 ENSG00000136371 15 MTHFS 4
43 ENSG00000137413 6 TAF8 10
44 ENSG00000137710 11 RDX 18
45 ENSG00000138750 4 NUP54 20
46 ENSG00000139974 14 SLC38A6 21
47 ENSG00000140905 16 GCSH 6
48 ENSG00000142698 1 C1orf94 2
49 ENSG00000143622 1 RIT1 6
50 ENSG00000144120 2 TMEM177 6
51 ENSG00000145388 4 METTL14 4
52 ENSG00000149054 11 ZNF215 6
53 ENSG00000149179 11 C11orf49 24
54 ENSG00000149806 11 FAU 9
55 ENSG00000150459 13 SAP18 7
56 ENSG00000152359 5 POC5 15
57 ENSG00000153044 5 CENPH 6
58 ENSG00000153250 2 RBMS1 15
59 ENSG00000156011 8 PSD3 18
60 ENSG00000156531 X PHF6 6
61 ENSG00000158545 16 ZC3H18 12
62 ENSG00000161021 5 MAML1 4
63 ENSG00000162458 1 FBLIM1 16
64 ENSG00000163714 3 U2SURP 18
65 ENSG00000163950 4 SLBP 6
66 ENSG00000164074 4 C4orf29 5
67 ENSG00000165186 X PTCHD1 2
68 ENSG00000166822 16 TMEM170A 7
69 ENSG00000168038 3 ULK4 12
70 ENSG00000168610 17 STAT3 14
71 ENSG00000170471 20 RALGAPB 10
72 ENSG00000171466 19 ZNF562 12
73 ENSG00000171603 1 CLSTN1 6
74 ENSG00000171848 2 RRM2 10
75 ENSG00000172113 3 NME6 21
76 ENSG00000172534 X HCFC1 5
77 ENSG00000172831 16 CES2 12
78 ENSG00000173486 11 FKBP2 6
79 ENSG00000174238 17 PITPNA 11
80 ENSG00000174744 11 BRMS1 9
81 ENSG00000179195 12 ZNF664 14
82 ENSG00000180398 2 MCFD2 17
83 ENSG00000181038 17 METTL23 14
84 ENSG00000183150 12 GPR19 5
85 ENSG00000183684 17 ALYREF 5
86 ENSG00000183889 16 NPIPA7 13
87 ENSG00000185043 15 CIB1 1
88 ENSG00000185475 11 TMEM179B 5
89 ENSG00000185650 14 ZFP36L1 7
90 ENSG00000187664 19 HAPLN4 2
91 ENSG00000188636 22 LDOC1L 1
92 ENSG00000188643 1 S100A16 5
93 ENSG00000196267 19 ZNF836 6
94 ENSG00000197451 5 HNRNPAB 8
95 ENSG00000198513 14 ATL1 12
96 ENSG00000198633 19 ZNF534 4
97 ENSG00000204442 13 FAM155A 1
98 ENSG00000204469 6 PRRC2A 14
99 ENSG00000213903 14 LTB4R 5
100 ENSG00000215114 8 UBXN2B 5
101 ENSG00000219545 7 RPA3-AS1 7
102 ENSG00000257093 7 KIAA1147 2
103 ENSG00000270188 1 MTRNR2L11 1
description
1 mannosidase, alpha, class 2B, member 2 [Source:HGNC Symbol;Acc:29623]
2 phosphoglucomutase 3 [Source:HGNC Symbol;Acc:8907]
3 zinc finger, ZZ-type containing 3 [Source:HGNC Symbol;Acc:24523]
4 sorting nexin 29 [Source:HGNC Symbol;Acc:30542]
5 GRIP1 associated protein 1 [Source:HGNC Symbol;Acc:18706]
6 asparagine synthetase (glutamine-hydrolyzing) [Source:HGNC Symbol;Acc:753]
7 furry homolog (Drosophila) [Source:HGNC Symbol;Acc:20367]
8 NUAK family, SNF1-like kinase, 1 [Source:HGNC Symbol;Acc:14311]
9 GRAM domain containing 4 [Source:HGNC Symbol;Acc:29113]
10 erythrocyte membrane protein band 4.1-like 2 [Source:HGNC Symbol;Acc:3379]
11 tRNA methyltransferase 6 homolog (S. cerevisiae) [Source:HGNC Symbol;Acc:20900]
12 ezrin [Source:HGNC Symbol;Acc:12691]
13 phosphatidylinositol glycan anchor biosynthesis, class H [Source:HGNC Symbol;Acc:8964]
14 nuclear factor of activated T-cells, cytoplasmic, calcineurin-dependent 4 [Source:HGNC Symbol;Acc:7778]
15 suppressor of variegation 3-9 homolog 1 (Drosophila) [Source:HGNC Symbol;Acc:11479]
16 ADP-ribosylation factor-like 2 binding protein [Source:HGNC Symbol;Acc:17146]
17 solute carrier family 38, member 7 [Source:HGNC Symbol;Acc:25582]
18 transcription elongation factor B (SIII), polypeptide 2 (18kDa, elongin B) [Source:HGNC Symbol;Acc:11619]
19 PIH1 domain containing 1 [Source:HGNC Symbol;Acc:26075]
20 protein inhibitor of activated STAT, 4 [Source:HGNC Symbol;Acc:17002]
21 serpin peptidase inhibitor, clade E (nexin, plasminogen activator inhibitor type 1), member 1 [Source:HGNC Symbol;Acc:8583]
22 coiled-coil-helix-coiled-coil-helix domain containing 3 [Source:HGNC Symbol;Acc:21906]
23 coiled-coil domain containing 6 [Source:HGNC Symbol;Acc:18782]
24 galactokinase 1 [Source:HGNC Symbol;Acc:4118]
25 enhancer of zeste homolog 1 (Drosophila) [Source:HGNC Symbol;Acc:3526]
26 histone deacetylase 5 [Source:HGNC Symbol;Acc:14068]
27 Hermansky-Pudlak syndrome 5 [Source:HGNC Symbol;Acc:17022]
28 protein tyrosine phosphatase type IVA, member 1 [Source:HGNC Symbol;Acc:9634]
29 HBS1-like (S. cerevisiae) [Source:HGNC Symbol;Acc:4834]
30 striatin, calmodulin binding protein [Source:HGNC Symbol;Acc:11424]
31 CCAAT/enhancer binding protein (C/EBP), zeta [Source:HGNC Symbol;Acc:24218]
32 activating transcription factor 6 [Source:HGNC Symbol;Acc:791]
33 solute carrier family 12 (potassium/chloride transporter), member 4 [Source:HGNC Symbol;Acc:10913]
34 neurensin 2 [Source:HGNC Symbol;Acc:16229]
35 serine/threonine/tyrosine interacting-like 1 [Source:HGNC Symbol;Acc:18165]
36 apolipoprotein C-I [Source:HGNC Symbol;Acc:607]
37 TNF receptor-associated factor 7, E3 ubiquitin protein ligase [Source:HGNC Symbol;Acc:20456]
38 like-glycosyltransferase [Source:HGNC Symbol;Acc:6511]
39 family with sequence similarity 189, member A2 [Source:HGNC Symbol;Acc:24820]
40 SRSF protein kinase 2 [Source:HGNC Symbol;Acc:11306]
41 KIAA1009 [Source:HGNC Symbol;Acc:21107]
42 5,10-methenyltetrahydrofolate synthetase (5-formyltetrahydrofolate cyclo-ligase) [Source:HGNC Symbol;Acc:7437]
43 TAF8 RNA polymerase II, TATA box binding protein (TBP)-associated factor, 43kDa [Source:HGNC Symbol;Acc:17300]
44 radixin [Source:HGNC Symbol;Acc:9944]
45 nucleoporin 54kDa [Source:HGNC Symbol;Acc:17359]
46 solute carrier family 38, member 6 [Source:HGNC Symbol;Acc:19863]
47 glycine cleavage system protein H (aminomethyl carrier) [Source:HGNC Symbol;Acc:4208]
48 chromosome 1 open reading frame 94 [Source:HGNC Symbol;Acc:28250]
49 Ras-like without CAAX 1 [Source:HGNC Symbol;Acc:10023]
50 transmembrane protein 177 [Source:HGNC Symbol;Acc:28143]
51 methyltransferase like 14 [Source:HGNC Symbol;Acc:29330]
52 zinc finger protein 215 [Source:HGNC Symbol;Acc:13007]
53 chromosome 11 open reading frame 49 [Source:HGNC Symbol;Acc:28720]
54 Finkel-Biskis-Reilly murine sarcoma virus (FBR-MuSV) ubiquitously expressed [Source:HGNC Symbol;Acc:3597]
55 Sin3A-associated protein, 18kDa [Source:HGNC Symbol;Acc:10530]
56 POC5 centriolar protein [Source:HGNC Symbol;Acc:26658]
57 centromere protein H [Source:HGNC Symbol;Acc:17268]
58 RNA binding motif, single stranded interacting protein 1 [Source:HGNC Symbol;Acc:9907]
59 pleckstrin and Sec7 domain containing 3 [Source:HGNC Symbol;Acc:19093]
60 PHD finger protein 6 [Source:HGNC Symbol;Acc:18145]
61 zinc finger CCCH-type containing 18 [Source:HGNC Symbol;Acc:25091]
62 mastermind-like 1 (Drosophila) [Source:HGNC Symbol;Acc:13632]
63 filamin binding LIM protein 1 [Source:HGNC Symbol;Acc:24686]
64 U2 snRNP-associated SURP domain containing [Source:HGNC Symbol;Acc:30855]
65 stem-loop binding protein [Source:HGNC Symbol;Acc:10904]
66 chromosome 4 open reading frame 29 [Source:HGNC Symbol;Acc:26111]
67 patched domain containing 1 [Source:HGNC Symbol;Acc:26392]
68 transmembrane protein 170A [Source:HGNC Symbol;Acc:29577]
69 unc-51 like kinase 4 [Source:HGNC Symbol;Acc:15784]
70 signal transducer and activator of transcription 3 (acute-phase response factor) [Source:HGNC Symbol;Acc:11364]
71 Ral GTPase activating protein, beta subunit (non-catalytic) [Source:HGNC Symbol;Acc:29221]
72 zinc finger protein 562 [Source:HGNC Symbol;Acc:25950]
73 calsyntenin 1 [Source:HGNC Symbol;Acc:17447]
74 ribonucleotide reductase M2 [Source:HGNC Symbol;Acc:10452]
75 NME/NM23 nucleoside diphosphate kinase 6 [Source:HGNC Symbol;Acc:20567]
76 host cell factor C1 (VP16-accessory protein) [Source:HGNC Symbol;Acc:4839]
77 carboxylesterase 2 [Source:HGNC Symbol;Acc:1864]
78 FK506 binding protein 2, 13kDa [Source:HGNC Symbol;Acc:3718]
79 phosphatidylinositol transfer protein, alpha [Source:HGNC Symbol;Acc:9001]
80 breast cancer metastasis suppressor 1 [Source:HGNC Symbol;Acc:17262]
81 zinc finger protein 664 [Source:HGNC Symbol;Acc:25406]
82 multiple coagulation factor deficiency 2 [Source:HGNC Symbol;Acc:18451]
83 methyltransferase like 23 [Source:HGNC Symbol;Acc:26988]
84 G protein-coupled receptor 19 [Source:HGNC Symbol;Acc:4473]
85 Aly/REF export factor [Source:HGNC Symbol;Acc:19071]
86 Protein PKD1P1 [Source:UniProtKB/TrEMBL;Acc:H0YEP2]
87 calcium and integrin binding 1 (calmyrin) [Source:HGNC Symbol;Acc:16920]
88 transmembrane protein 179B [Source:HGNC Symbol;Acc:33744]
89 ZFP36 ring finger protein-like 1 [Source:HGNC Symbol;Acc:1107]
90 hyaluronan and proteoglycan link protein 4 [Source:HGNC Symbol;Acc:31357]
91 leucine zipper, down-regulated in cancer 1-like [Source:HGNC Symbol;Acc:13343]
92 S100 calcium binding protein A16 [Source:HGNC Symbol;Acc:20441]
93 zinc finger protein 836 [Source:HGNC Symbol;Acc:34333]
94 heterogeneous nuclear ribonucleoprotein A/B [Source:HGNC Symbol;Acc:5034]
95 atlastin GTPase 1 [Source:HGNC Symbol;Acc:11231]
96 zinc finger protein 534 [Source:HGNC Symbol;Acc:26337]
97 family with sequence similarity 155, member A [Source:HGNC Symbol;Acc:33877]
98 proline-rich coiled-coil 2A [Source:HGNC Symbol;Acc:13918]
99 leukotriene B4 receptor [Source:HGNC Symbol;Acc:6713]
100 UBX domain protein 2B [Source:HGNC Symbol;Acc:27035]
101 RPA3 antisense RNA 1 [Source:HGNC Symbol;Acc:48955]
102 KIAA1147 [Source:HGNC Symbol;Acc:29472]
103 MT-RNR2-like 11 (pseudogene) [Source:HGNC Symbol;Acc:37168]
Export significant genes.
write.table(rownames(molecules_ENSG[F.p.adj < .05, ]),
file = "../data/genelist-cv-sig.txt",
row.names = FALSE, col.names = FALSE, quote = FALSE)
Use string analysis data base to look for protein protein interaction of these differential CV genes.
This is the confidence view. Stronger associations are represented by thicker lines.
This is the evidence view. Different line colors represent the types of evidence for the association.
[Description and steps are copied from my work with Sidney]
GOstats performed Hypergeometric test for gene set enrichment. The test can take account into the conditional relationship between GO terms. Standard GOstats output contains the following information:
GO analysis workflow is consisted of two major steps. First, we identified GO terms significant enriched given a p-value cutoff, which we set to 1 so that all possible GO terms associated with the genes can be included. We do this to eliminate false negatives. Then, we take the union of the GO terms across functional categories of interest (e.g., translation attentuation and post-translational buffering). The GO terms that are significant in at least one of the four functional categories are presented in the results.
Note. The following heatmaps are generated based on the p-values of the gene sets.
if (file.exists("rda/cv-annotation/go-cv.rda")) {
load("rda/cv-annotation/go-cv.rda")
} else {
go_sig_cv <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
my_ensembl_gene_test = rownames(molecules_ENSG)[F.p.adj < .05],
pval_cutoff = 1, ontology=c("BP","CC","MF") )
save(molecules_ENSG,
F.p.adj, go_sig_cv, file = "rda/cv-annotation/go-cv.rda")
}
Extract terms
if (file.exists("rda/cv-annotation/go-cv-terms.rda")) {
load("rda/cv-annotation/go-cv-terms.rda")
} else {
# Biological process
goterms_bp <- summary(go_sig_cv$GO$BP, pvalue = 1)
goterms_bp <- data.frame(ID = goterms_bp[[1]],
Pvalue = goterms_bp[[2]],
Terms = goterms_bp[[7]])
goterms_bp <- goterms_bp[order(goterms_bp$Pvalue), ]
# Cellular component
goterms_cc <- summary(go_sig_cv$GO$CC, pvalue = 1)
goterms_cc <- data.frame(ID = goterms_cc[[1]],
Pvalue = goterms_cc[[2]],
Terms = goterms_cc[[7]])
goterms_cc <- goterms_cc[order(goterms_cc$Pvalue), ]
# Molecular function
goterms_mf <- summary(go_sig_cv$GO$MF, pvalue = 1)
goterms_mf <- data.frame(ID = goterms_mf[[1]],
Pvalue = goterms_mf[[2]],
Terms = goterms_mf[[7]])
goterms_mf <- goterms_mf[order(goterms_mf$Pvalue), ]
save(goterms_bp, goterms_cc, goterms_mf,
file = "rda/cv-annotation/go-cv-terms.rda")
}
*Biological process
dim(goterms_bp)
[1] 1899 3
kable(goterms_bp[1:50, ])
| ID | Pvalue | Terms |
|---|---|---|
| GO:0071294 | 0.0000000 | cellular response to zinc ion |
| GO:0010043 | 0.0000000 | response to zinc ion |
| GO:0071276 | 0.0000000 | cellular response to cadmium ion |
| GO:0046686 | 0.0000043 | response to cadmium ion |
| GO:0071248 | 0.0000050 | cellular response to metal ion |
| GO:1990267 | 0.0000079 | response to transition metal nanoparticle |
| GO:0071241 | 0.0000120 | cellular response to inorganic substance |
| GO:0045926 | 0.0000139 | negative regulation of growth |
| GO:0010950 | 0.0001659 | positive regulation of endopeptidase activity |
| GO:0010952 | 0.0003076 | positive regulation of peptidase activity |
| GO:0036017 | 0.0004702 | response to erythropoietin |
| GO:0036018 | 0.0004702 | cellular response to erythropoietin |
| GO:0045091 | 0.0004702 | regulation of single stranded viral RNA replication via double stranded DNA intermediate |
| GO:0043280 | 0.0007251 | positive regulation of cysteine-type endopeptidase activity involved in apoptotic process |
| GO:0040008 | 0.0008488 | regulation of growth |
| GO:2001056 | 0.0009056 | positive regulation of cysteine-type endopeptidase activity |
| GO:0039692 | 0.0011619 | single stranded viral RNA replication via double stranded DNA intermediate |
| GO:0071371 | 0.0011619 | cellular response to gonadotropin stimulus |
| GO:0009611 | 0.0012159 | response to wounding |
| GO:0045862 | 0.0012961 | positive regulation of proteolysis |
| GO:0070979 | 0.0013827 | protein K11-linked ubiquitination |
| GO:0052548 | 0.0013872 | regulation of endopeptidase activity |
| GO:0070316 | 0.0016172 | regulation of G0 to G1 transition |
| GO:0010038 | 0.0017304 | response to metal ion |
| GO:0001101 | 0.0019230 | response to acid chemical |
| GO:0052547 | 0.0019253 | regulation of peptidase activity |
| GO:0045023 | 0.0021437 | G0 to G1 transition |
| GO:0071345 | 0.0033358 | cellular response to cytokine stimulus |
| GO:0034698 | 0.0034054 | response to gonadotropin |
| GO:0035456 | 0.0034054 | response to interferon-beta |
| GO:0039694 | 0.0034054 | viral RNA genome replication |
| GO:0039703 | 0.0034054 | RNA replication |
| GO:0006978 | 0.0041380 | DNA damage response, signal transduction by p53 class mediator resulting in transcription of p21 class mediator |
| GO:0043065 | 0.0044896 | positive regulation of apoptotic process |
| GO:0043068 | 0.0047411 | positive regulation of programmed cell death |
| GO:0042772 | 0.0049369 | DNA damage response, signal transduction resulting in transcription |
| GO:0046688 | 0.0049369 | response to copper ion |
| GO:0071229 | 0.0054035 | cellular response to acid chemical |
| GO:0048519 | 0.0056520 | negative regulation of biological process |
| GO:0060337 | 0.0057822 | type I interferon signaling pathway |
| GO:0071357 | 0.0057822 | cellular response to type I interferon |
| GO:0072210 | 0.0058007 | metanephric nephron development |
| GO:0010942 | 0.0059608 | positive regulation of cell death |
| GO:0044707 | 0.0060654 | single-multicellular organism process |
| GO:0034340 | 0.0061873 | response to type I interferon |
| GO:1903900 | 0.0065011 | regulation of viral life cycle |
| GO:0045069 | 0.0066088 | regulation of viral genome replication |
| GO:0043281 | 0.0067085 | regulation of cysteine-type endopeptidase activity involved in apoptotic process |
| GO:0001975 | 0.0067283 | response to amphetamine |
| GO:0040007 | 0.0068687 | growth |
*Cellular component
dim(goterms_cc)
[1] 230 3
kable(goterms_cc[1:50, ])
| ID | Pvalue | Terms |
|---|---|---|
| GO:0000152 | 0.0029964 | nuclear ubiquitin ligase complex |
| GO:0032797 | 0.0045608 | SMN complex |
| GO:0097504 | 0.0045608 | Gemini of coiled bodies |
| GO:0031519 | 0.0053634 | PcG protein complex |
| GO:0019035 | 0.0094132 | viral integration complex |
| GO:0035098 | 0.0096514 | ESC/E(Z) complex |
| GO:0034719 | 0.0108718 | SMN-Sm protein complex |
| GO:0005680 | 0.0149138 | anaphase-promoting complex |
| GO:0000151 | 0.0177601 | ubiquitin ligase complex |
| GO:0008043 | 0.0187388 | intracellular ferritin complex |
| GO:0009330 | 0.0187388 | DNA topoisomerase complex (ATP-hydrolyzing) |
| GO:0070288 | 0.0187388 | ferritin complex |
| GO:0001740 | 0.0279776 | Barr body |
| GO:0097440 | 0.0279776 | apical dendrite |
| GO:0043204 | 0.0341473 | perikaryon |
| GO:0061200 | 0.0371304 | clathrin-sculpted gamma-aminobutyric acid transport vesicle |
| GO:0061202 | 0.0371304 | clathrin-sculpted gamma-aminobutyric acid transport vesicle membrane |
| GO:0072562 | 0.0448923 | blood microparticle |
| GO:0000805 | 0.0461979 | X chromosome |
| GO:0060198 | 0.0461979 | clathrin-sculpted vesicle |
| GO:0000276 | 0.0551810 | mitochondrial proton-transporting ATP synthase complex, coupling factor F(o) |
| GO:0000974 | 0.0551810 | Prp19 complex |
| GO:0030870 | 0.0551810 | Mre11 complex |
| GO:0031461 | 0.0556633 | cullin-RING ubiquitin ligase complex |
| GO:0005681 | 0.0571076 | spliceosomal complex |
| GO:0035102 | 0.0640804 | PRC1 complex |
| GO:0015030 | 0.0642553 | Cajal body |
| GO:0048471 | 0.0781674 | perinuclear region of cytoplasm |
| GO:0000790 | 0.0868045 | nuclear chromatin |
| GO:0045263 | 0.0988565 | proton-transporting ATP synthase complex, coupling factor F(o) |
| GO:0031011 | 0.1073490 | Ino80 complex |
| GO:0033202 | 0.1073490 | DNA helicase complex |
| GO:0043025 | 0.1100471 | neuronal cell body |
| GO:0030018 | 0.1123741 | Z disc |
| GO:0008076 | 0.1157623 | voltage-gated potassium channel complex |
| GO:0005912 | 0.1219669 | adherens junction |
| GO:0034705 | 0.1240971 | potassium channel complex |
| GO:0035097 | 0.1247547 | histone methyltransferase complex |
| GO:0070161 | 0.1327431 | anchoring junction |
| GO:0031674 | 0.1342464 | I band |
| GO:0000785 | 0.1405080 | chromatin |
| GO:0031093 | 0.1486384 | platelet alpha granule lumen |
| GO:0033177 | 0.1486384 | proton-transporting two-sector ATPase complex, proton-transporting domain |
| GO:0035861 | 0.1486384 | site of double-strand break |
| GO:0034774 | 0.1566668 | secretory granule lumen |
| GO:0000228 | 0.1581837 | nuclear chromosome |
| GO:0097346 | 0.1646203 | INO80-type complex |
| GO:0044297 | 0.1691931 | cell body |
| GO:1902494 | 0.1702408 | catalytic complex |
| GO:0005925 | 0.1704578 | focal adhesion |
*Molecular function
dim(goterms_mf)
[1] 285 3
kable(goterms_mf[1:50, ])
| ID | Pvalue | Terms |
|---|---|---|
| GO:0005520 | 0.0041803 | insulin-like growth factor binding |
| GO:0004556 | 0.0082278 | alpha-amylase activity |
| GO:0004595 | 0.0082278 | pantetheine-phosphate adenylyltransferase activity |
| GO:0004949 | 0.0082278 | cannabinoid receptor activity |
| GO:0016160 | 0.0082278 | amylase activity |
| GO:0008270 | 0.0089991 | zinc ion binding |
| GO:0061631 | 0.0093864 | ubiquitin conjugating enzyme activity |
| GO:0061650 | 0.0104352 | ubiquitin-like protein conjugating enzyme activity |
| GO:0046914 | 0.0112368 | transition metal ion binding |
| GO:0008201 | 0.0119451 | heparin binding |
| GO:0005178 | 0.0130937 | integrin binding |
| GO:0003883 | 0.0163887 | CTP synthase activity |
| GO:0004140 | 0.0163887 | dephospho-CoA kinase activity |
| GO:0016900 | 0.0163887 | oxidoreductase activity, acting on the CH-OH group of donors, disulfide as acceptor |
| GO:0032090 | 0.0163887 | Pyrin domain binding |
| GO:0047057 | 0.0163887 | vitamin-K-epoxide reductase (warfarin-sensitive) activity |
| GO:1990254 | 0.0163887 | keratin filament binding |
| GO:0005539 | 0.0220635 | glycosaminoglycan binding |
| GO:0004322 | 0.0244834 | ferroxidase activity |
| GO:0005251 | 0.0244834 | delayed rectifier potassium channel activity |
| GO:0008199 | 0.0244834 | ferric iron binding |
| GO:0016724 | 0.0244834 | oxidoreductase activity, oxidizing metal ions, oxygen as acceptor |
| GO:1990050 | 0.0244834 | phosphatidic acid transporter activity |
| GO:0000774 | 0.0325124 | adenyl-nucleotide exchange factor activity |
| GO:0003918 | 0.0325124 | DNA topoisomerase type II (ATP-hydrolyzing) activity |
| GO:0005212 | 0.0325124 | structural constituent of eye lens |
| GO:0061505 | 0.0325124 | DNA topoisomerase II activity |
| GO:0003696 | 0.0404761 | satellite DNA binding |
| GO:0008494 | 0.0404761 | translation activator activity |
| GO:0055131 | 0.0404761 | C3HC4-type RING finger domain binding |
| GO:0003916 | 0.0483751 | DNA topoisomerase activity |
| GO:0019215 | 0.0483751 | intermediate filament binding |
| GO:0030368 | 0.0483751 | interleukin-17 receptor activity |
| GO:0070087 | 0.0483751 | chromo shadow domain binding |
| GO:0008083 | 0.0485652 | growth factor activity |
| GO:0050839 | 0.0540124 | cell adhesion molecule binding |
| GO:0044877 | 0.0550545 | macromolecular complex binding |
| GO:0019237 | 0.0562100 | centromeric DNA binding |
| GO:1901681 | 0.0605070 | sulfur compound binding |
| GO:0008144 | 0.0635227 | drug binding |
| GO:0004862 | 0.0639812 | cAMP-dependent protein kinase inhibitor activity |
| GO:0008139 | 0.0639812 | nuclear localization sequence binding |
| GO:0042826 | 0.0680492 | histone deacetylase binding |
| GO:0008301 | 0.0716892 | DNA binding, bending |
| GO:0090079 | 0.0716892 | translation regulator activity, nucleic acid binding |
| GO:0048038 | 0.0793346 | quinone binding |
| GO:0051787 | 0.0793346 | misfolded protein binding |
| GO:0061630 | 0.0846794 | ubiquitin protein ligase activity |
| GO:0008656 | 0.0869178 | cysteine-type endopeptidase activator activity involved in apoptotic process |
| GO:0016722 | 0.0869178 | oxidoreductase activity, oxidizing metal ions |
sessionInfo()
R version 3.2.1 (2015-06-18)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.10.4 (Yosemite)
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] grid stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] biomaRt_2.24.1 gridExtra_2.0.0 broman_0.59-5 zoo_1.7-12
[5] ggplot2_1.0.1 edgeR_3.10.5 limma_3.24.15 dplyr_0.4.3
[9] data.table_1.9.6 knitr_1.11
loaded via a namespace (and not attached):
[1] Rcpp_0.12.1 highr_0.5.1 GenomeInfoDb_1.4.3
[4] formatR_1.2.1 plyr_1.8.3 bitops_1.0-6
[7] tools_3.2.1 digest_0.6.8 RSQLite_1.0.0
[10] evaluate_0.8 gtable_0.1.2 lattice_0.20-33
[13] DBI_0.3.1 yaml_2.1.13 parallel_3.2.1
[16] proto_0.3-10 stringr_1.0.0 IRanges_2.2.9
[19] S4Vectors_0.6.6 stats4_3.2.1 Biobase_2.28.0
[22] R6_2.1.1 AnnotationDbi_1.30.1 XML_3.98-1.3
[25] rmarkdown_0.8.1 reshape2_1.4.1 magrittr_1.5
[28] BiocGenerics_0.14.0 scales_0.3.0 htmltools_0.2.6
[31] MASS_7.3-44 assertthat_0.1 colorspace_1.2-6
[34] labeling_0.3 stringi_0.5-5 RCurl_1.95-4.7
[37] munsell_0.4.2 chron_2.3-47